import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# import plotly.io as pio
# pio.renderers.default = "png"
# function used in case we would like to select desired timestamp
def trunc_df(df, start='2020-07-01 19:30:00', end='2020-08-01 19:30:00'):
    """Return the slice of *df* whose DatetimeIndex lies within [start, end].

    Parameters
    ----------
    df : pd.DataFrame
        Frame indexed by a (sorted) DatetimeIndex.
    start, end : str or datetime-like, optional
        Inclusive window bounds. Defaults reproduce the original
        hard-coded July-2020 window, so existing callers are unaffected.
    """
    return df.loc[pd.to_datetime(start):pd.to_datetime(end)]
# to always have the newest plot versions, delete file before creating new one
def remove_file_if_exists(file_path):
    """Delete *file_path* if present; do nothing if it does not exist.

    Uses EAFP (try/except) rather than an exists() check, which removes the
    check-then-act race where the file could vanish between the two calls.
    """
    try:
        os.remove(file_path)
    except FileNotFoundError:
        pass
# svd global variables
SVD_FOLDER_PATH = 'svd_plots'
SVD_RECONSTUCTION_FOLDER_PATH = f'{SVD_FOLDER_PATH}/reconstuction'
SVD_COLUMN = 'tempC'
# recreate directories
# os.makedirs(..., exist_ok=True) is race-free (no exists-then-mkdir TOCTOU
# window) and also creates the parent folder when only the nested path is
# missing; the print is kept for parity with the original progress output.
for _folder in (SVD_FOLDER_PATH, SVD_RECONSTUCTION_FOLDER_PATH):
    if not os.path.exists(_folder):
        print(f'Creating folder {_folder}')
    os.makedirs(_folder, exist_ok=True)
# function to create csv file used for analysis
def create_svd_df(column_value):
    """Pivot the raw long-format weather file into a wide (time x town) CSV.

    Reads 'data.csv.gz' keeping only date_time/city/<column_value>, then
    writes 'data_svd_<column_value>.csv' with one row per 3-hourly timestamp
    and one column per town (towns sorted alphabetically).

    NOTE(review): the plain reshape assumes the raw rows are grouped by
    timestamp with towns in the same sorted order throughout — confirm
    against the data source.
    """
    source_path = 'data.csv.gz'
    target_path = f'data_svd_{column_value}.csv'
    raw = pd.read_csv(source_path,
                      usecols=['date_time', 'city', column_value],
                      compression='gzip')
    towns = sorted(list(raw['city'].unique()))
    timestamps = pd.date_range(start=raw.iloc[0]['date_time'],
                               end=raw.iloc[-1]['date_time'],
                               freq='3h')
    values = np.array(raw[column_value]).reshape(len(timestamps), len(towns))
    wide = pd.DataFrame(data=values, columns=towns, index=timestamps, dtype=float)
    wide.to_csv(target_path)
# Load town geolocations (CITY/LAT/LNG columns) used later for the map plots.
GEOLOCATION_FILENAME = 'geo_position.csv'
df_geolocation = pd.read_csv(GEOLOCATION_FILENAME).sort_values(by='CITY')
unique_towns = sorted(list(df_geolocation['CITY'].unique())) # get unique names of towns ordered by name
print(f'Doing analisys for {SVD_COLUMN}')
# Build the wide (time x town) CSV once via create_svd_df; later runs reuse
# the cached file instead of re-reading the large compressed source.
SVD_DATA_FILENAME = f'data_svd_{SVD_COLUMN}.csv'
if not os.path.exists(SVD_DATA_FILENAME):
    print(f'Creating svd file for {SVD_COLUMN}')
    create_svd_df(SVD_COLUMN)
    print('File created!')
# Out: Doing analisys for tempC
print(f'Loading svd file for {SVD_COLUMN}')
SVD_DATA_FILENAME = f'data_svd_{SVD_COLUMN}.csv'
# Wide matrix produced by create_svd_df: rows = 3-hourly timestamps
# (first column, restored as the index), columns = towns.
df_svd = pd.read_csv(SVD_DATA_FILENAME, index_col=0)
df_svd.index = pd.to_datetime(df_svd.index)
# df_svd = trunc_df(df_svd)
svd_A = np.array(df_svd)
# build matrix U, S, V
# full_matrices=False gives the thin SVD: U is (m x k), S is (k,), and the
# third return value is numpy's Vh (V transposed) — here named svd_V, so its
# ROWS are the right singular vectors, with k = min(m, n) = number of towns.
svd_U, svd_S, svd_V = np.linalg.svd(svd_A, full_matrices=False)
# Out: Loading svd file for tempC
# function to calculate and plot precision of svd reconstruction
def svd_precision(svd_S):
    """Bar-plot the cumulative share of the singular-value sum per rank.

    Bar r shows sum(S[:r+1]) / sum(S) — how much of the total singular-value
    mass a rank-(r+1) truncation keeps. Saves the chart as
    <SVD_FOLDER_PATH>/<SVD_COLUMN>_svd_precision.png (stale copy removed
    first) and displays it.
    """
    out_path = f'{SVD_FOLDER_PATH}/{SVD_COLUMN}_svd_precision.png'
    remove_file_if_exists(out_path)
    ranks = np.arange(np.size(svd_S))
    cumulative_share = np.cumsum(svd_S / np.sum(svd_S))
    fig = go.Figure(data=[go.Bar(x=ranks, y=cumulative_share)])
    fig.update_layout(title_text='SVD Precision', title_x=0.5,
                      xaxis_title='Rank', yaxis_title='Precision')
    # zoom the y axis: the first component already explains well over half
    fig.update_yaxes(range=[0.5, 1])
    fig.write_image(out_path)
    fig.show()
# Calculating svd reconstruction precision
svd_precision(svd_S)
# full reconstruction - matrix svd_Ar
# svd_U * svd_S scales each column of U by its singular value; dotting with
# svd_V (numpy's Vh) rebuilds A exactly, so the printed diff is float noise.
svd_Ar = np.dot(svd_U * svd_S, svd_V)
print(f'Diff: {np.mean(np.abs(svd_A - svd_Ar))}')
# lower rank reconstruction - matrix svd_Ar
# Keep only the first k singular triplets — the best rank-k approximation.
k = 5
svd_Ar = np.dot(svd_U[:,:k] * svd_S[:k], svd_V[:k, :])
print(f'Diff reconstruction: {np.mean(np.abs(svd_A - svd_Ar))}')
# Out: Diff: 1.5364039972083385e-14 Diff reconstruction: 0.25542727208937854
# function to calculate and plot average error of svd for k=n
def svd_average_error(svd_A, svd_Ar, k, unique_towns):
    """Bar-plot the per-town mean absolute reconstruction error.

    Averages |svd_A - svd_Ar| over time (axis 0) to get one error value per
    town, then saves the labelled bar chart to
    <SVD_FOLDER_PATH>/<SVD_COLUMN>_svd_avg_err.png and shows it.
    """
    out_path = f'{SVD_FOLDER_PATH}/{SVD_COLUMN}_svd_avg_err.png'
    remove_file_if_exists(out_path)
    per_town_error = np.average(np.abs(svd_A - svd_Ar), axis=0)
    town_positions = np.arange(0, len(unique_towns))
    fig = go.Figure(data=[go.Bar(x=town_positions, y=per_town_error)])
    # replace numeric ticks with town names
    fig.update_layout(title_text='SVD Average Error', title_x=0.5,
                      yaxis_title=f'Average error of reconstruction with rank k={k}',
                      xaxis=dict(tickmode='array',
                                 tickvals=town_positions,
                                 ticktext=unique_towns))
    fig.update_xaxes(tickangle=90)
    fig.write_image(out_path)
    fig.show()
# Calculating average error
# NOTE(review): .copy() presumably guards the module-level town list from
# being held/mutated by the plotting layer — confirm it is actually needed.
svd_average_error(svd_A, svd_Ar, k, unique_towns.copy())
# function to plot dates to concept for k=n
def svd_dates_to_concept(k, index, svd_U):
    """Plot the first *k* left-singular-vector time series (dates → concepts).

    One scatter trace per concept i (column svd_U[:, i]) over *index*. The
    figure is saved to <SVD_FOLDER_PATH>/<SVD_COLUMN>_svd_dates_to_concept.png
    and then shown interactively with a range slider.

    Fix vs. original: a centered title is set before the image is written,
    matching the other svd_* plot helpers (which all title their saved PNGs).
    The rangeslider is still added only after write_image, since it is only
    useful in the interactive view.
    """
    SVD_DTC_FILENAME = f'{SVD_FOLDER_PATH}/{SVD_COLUMN}_svd_dates_to_concept.png'
    remove_file_if_exists(SVD_DTC_FILENAME)
    traces = [go.Scatter(x=index, y=svd_U[:, i], name=f'k={i}') for i in range(k)]
    fig = go.Figure(data=traces)
    fig.update_layout(title_text=f'Dates to Concept for k={k}', title_x=0.5)
    fig.write_image(SVD_DTC_FILENAME)
    fig.update_layout(xaxis=dict(rangeslider=dict(visible=True)))
    fig.show()
# dates to concept
# Plot how each of the first k concepts evolves over the time index.
svd_dates_to_concept(k, df_svd.index, svd_U)
# function to plot towns to concept for k=n
def svd_towns_to_concept(k, svd_V, unique_towns):
    """Grouped bar chart of the first *k* right-singular vectors (towns → concepts).

    Bar group i shows row svd_V[i, :] — one bar per town. The figure is saved
    to <SVD_FOLDER_PATH>/<SVD_COLUMN>_svd_towns_to_concept.png and shown.
    """
    out_path = f'{SVD_FOLDER_PATH}/{SVD_COLUMN}_svd_towns_to_concept.png'
    remove_file_if_exists(out_path)
    positions = np.arange(0, len(unique_towns))
    traces = [go.Bar(x=positions, y=svd_V[i, :], name=f'{i}') for i in range(k)]
    fig = go.Figure(data=traces)
    # replace numeric ticks with town names
    fig.update_layout(title_text=f'Towns to Concept for k={k}', title_x=0.5,
                      xaxis=dict(tickmode='array',
                                 tickvals=positions,
                                 ticktext=unique_towns))
    fig.update_xaxes(tickangle=90)
    fig.write_image(out_path)
    fig.show()
# towns to concept
# Bar chart of each town's weight in the first k concepts.
svd_towns_to_concept(k,svd_V, unique_towns.copy())
# plot map with values from SVD_V (towns to concept)
def plot_svd_map(unique_towns, vector, k, data_geo):
    """Scatter-map the towns colored by their weight in concept *k*.

    Parameters
    ----------
    unique_towns : list
        Unused; kept for backward compatibility with existing callers
        (town names are taken from data_geo['CITY'] instead).
    vector : array-like
        One row of svd_V — a weight per town, in data_geo row order.
    k : int
        Concept index; used only in the output filename.
    data_geo : pd.DataFrame
        Geolocation frame with CITY/LAT/LNG columns; a VALUES column is
        added in place (callers pass a copy).

    Fix vs. original: the .mapbox_token file is read with a context manager
    so the file handle is closed deterministically instead of leaking.
    """
    SVD_MAP_FILENAME = f'{SVD_FOLDER_PATH}/{SVD_COLUMN}_svd_map_k{k}.png'
    remove_file_if_exists(SVD_MAP_FILENAME)
    data_geo['VALUES'] = vector
    with open(".mapbox_token") as token_file:
        px.set_mapbox_access_token(token_file.read())
    fig = px.scatter_mapbox(data_geo, lat="LAT", lon="LNG",
                            color="VALUES", hover_name="CITY",
                            color_continuous_scale=px.colors.cyclical.Phase)
    fig.write_image(SVD_MAP_FILENAME)
    fig.show()
# plot maps
# One map per concept; df_geolocation is copied because plot_svd_map adds a
# VALUES column to the frame it receives.
for i in range(k):
    print(f'Ploting map for k = {i}')
    plot_svd_map(unique_towns, svd_V[i, :], i, df_geolocation.copy())
# Out: Ploting map for k = 0
# Out: Ploting map for k = 1
# Out: Ploting map for k = 2
# Out: Ploting map for k = 3
# Out: Ploting map for k = 4
# reconstruct temperatures of unique towns using svd
def svd_reconstruct(unique_towns, df_svd, svd_A, svd_U, svd_S, svd_V, k):
    """Plot, per town, the original series plus its rank-k SVD build-up.

    For town column iloc, concept i contributes svd_U[:, i] * svd_S[i] *
    svd_V[i, iloc]; contributions are stacked as translucent fill_between
    bands and their running sum is drawn as a dashed line (the rank-k
    reconstruction). One PNG per town is saved into
    SVD_RECONSTUCTION_FOLDER_PATH; only the 'Rijeka' figure is shown inline.
    """
    print(f'Saving reconstructins to {SVD_RECONSTUCTION_FOLDER_PATH}')
    for iloc, location in enumerate(unique_towns):
        SVD_RESONSTRUCTION_FILENAME = f'{SVD_RECONSTUCTION_FOLDER_PATH}/{SVD_COLUMN}_svd_reconstruction_plot_{location}.png'
        remove_file_if_exists(SVD_RESONSTRUCTION_FILENAME)
        fig = plt.figure(figsize=(20, 10), tight_layout=True)
        legend_handles = []
        legend_labels = []
        # Original measurements as small red dots (no connecting line).
        plt_orig, = plt.plot(df_svd[location], marker='o', ls='', c='r', ms=1)
        legend_handles.append(plt_orig)
        legend_labels.append('Original data')
        # Running sum of concept contributions for this town.
        a_cum = np.zeros(svd_A.shape[0])
        for i in range(k):
            # svd_V[i, iloc] is a scalar, so np.dot here is a scalar multiply
            # of the scaled left singular vector.
            a_k = np.dot(svd_U[:, i] * svd_S[i], svd_V[i, iloc])
            flbtw_k = plt.fill_between(df_svd.index, a_cum, a_cum + a_k, alpha=0.3, label=f'k= {i}')
            legend_handles.append(flbtw_k)
            legend_labels.append(f'k= {i}')
            a_cum += a_k
        # Dashed blue line = full rank-k reconstruction (sum of all bands).
        plt_recon, = plt.plot(df_svd.index, a_cum, marker='s', ls='--', c='b', lw=1, ms=1)
        legend_handles.append(plt_recon)
        legend_labels.append('Reconstruction')
        plt.legend(legend_handles, legend_labels)
        # Pad the y range a little beyond the observed data.
        plt.ylim(df_svd[location].min() - 2, df_svd[location].max() + 2)
        fig.savefig(SVD_RESONSTRUCTION_FILENAME, dpi=90)
        # Show only one representative town inline to keep notebook output
        # small; the title is set after savefig, so saved PNGs are untitled.
        if location == 'Rijeka':
            plt.title(f'Reconstruction for {location}')
            plt.show()
        plt.close(fig)
# SVD reconstruction temperature
# Writes one rank-k reconstruction plot per town into the reconstruction folder.
svd_reconstruct(unique_towns, df_svd, svd_A, svd_U, svd_S, svd_V, k)
# Out: Saving reconstructins to svd_plots/reconstuction
# save notebook before nbconvert
# NOTE(review): the lines below are Jupyter cell magics / shell escapes
# (%%javascript, !jupyter); they only run inside a notebook cell and are
# not valid plain Python — this file appears to be an svd.ipynb export.
import IPython
%%javascript
IPython.notebook.save_notebook()
# export notebook results to HTML
!jupyter nbconvert --to=HTML svd.ipynb
# # export notebook results to pdf
# !jupyter nbconvert --to=pdf correlation.ipynb